# Project: Serbia Survey
# Authors: William O'Brochta and Patrick Cunha Silva
# Goal: STM Analysis on FB posts (Entire Corpus)

rm(list = ls())

# Load Raw Data (note raw data are not provided because raw data are from Meta)
load(file = here('fb_data.RData'))

########################################################
################## STM: All Pages ######################
########################################################

# Prepare corpus
mystopwords <- c('s', '#39', '<', '>', 'rs', 'www', 
                 '8d', 'iä', 'org', 'https', 'http', 'ä', 'dr', 't', 
                 stopwords('english'), stopwords('russian'))
doc_pros <- textProcessor(fb_data$translation, 
                          metadata = fb_data[, c('inCyrillic', 'owner', 
                                                 'leg_election', 'prez_election', 'Facebook.Id', 'year', 'id', 'date')], 
                          language = 'english',
                          removestopwords = TRUE, 
                          removenumbers = TRUE, 
                          removepunctuation = TRUE,
                          lowercase = TRUE, 
                          stem = FALSE, 
                          striphtml = TRUE, 
                          customstopwords = mystopwords,
                          onlycharacter = TRUE)

# Plot Removed Words
plotRemoved(doc_pros[[1]], lower.thresh = seq(from = 10, to = 1000, by = 10)) 

# Remove Words that appear in less than 1% of the documents or in more than 90% or less than 1%
stmObj <- prepDocuments(doc_pros[[1]], doc_pros[[2]], meta=doc_pros$meta
                        , lower.thresh = round(nrow(fb_data) * 0.01), 
                        upper.thresh = round(nrow(fb_data) * 0.90))

# Create object to run model
docs <- stmObj$documents
vocab <- stmObj$vocab
meta <- stmObj$meta

####################################################
############ SI B.2 - Number of Topics #############
####################################################

#--------------- Uncomment the following Lines to run the model again -------------#
# Search across number of topics
# storage <- searchK(documents = docs,
#     vocab = vocab,
#     K = seq(5, 60, 5),
#     prevalence = ~ inCyrillic +  leg_election + prez_election + year ,
#     data = meta,
#     # cores = 8
#     )

# Load results 
load(file = here("storage.RData"))

# We need to identify the number of topics. We use the euclidian distance
# Note that Exclusivity and Semanatic Coherence are in different scales
# Thus, we first scale them.
y = (unlist(storage$results$exclus))
x = (unlist(storage$results$semcoh))
y = scale(unlist(storage$results$exclus))
x = scale(unlist(storage$results$semcoh))
eucledian <- sqrt((y - max(y))^2 + (x - max(x))^2)
round(cbind(unlist(storage$results$K), y, x, eucledian), 3)

# Find number of topics
c(unlist(storage$results$K))[which.min(sqrt((y - max(y))^2 + (x - max(x))^2))]

# Find ideal number of topics
par(mar = c(6, 6, 2, 2))
plot(y = unlist(storage$results$exclus), x = unlist(storage$results$semcoh),
     ylab = 'Exclusivity', 
     xlab = 'Semantic Coherence', 
     pch = 1:length(seq(5, 60, 5)),
     cex.lab = 1.75, 
     cex.axis = 1.5)
legend('bottomleft', legend = c(unlist(storage$results$K)), 
       pch = 1:length(seq(5, 60, 5))) 
plot(y = y, x = x,
     ylab = 'Exclusivity', 
     xlab = 'Semantic Coherence', 
     pch = 1:length(seq(5, 60, 5)),
     cex.lab = 1.75, 
     cex.axis = 1.5)
legend('bottomleft', legend = c(unlist(storage$results$K)), 
       pch = 1:length(seq(5, 60, 5))) 
# K = 15 seems to be the best option, given that it is a good balance between 
# Semantic Coherence, Exclusivity, and Residual

######################################
############ Main Model ##############
######################################

#--------------- Uncomment Lines below to run the model again -------------#
# # Run STM 
# matDoc <- stm(documents = docs,
#     vocab = vocab,
#     K = 15,
#     data = meta,
# 	  prevalence = ~ inCyrillic + leg_election + prez_election + year,
#     init.type = 'Spectral'
# )
# # Save model
# save(matDoc, file = here('outModel_btw.RData'))

# Load model 
load(file = here('outModel.RData'))

# Label these topics
labelTopics(matDoc)

# Highest Prob: are the words within each topic with the highest probability 
# FREX: are the words that are both frequent and exclusive

# Highly associated documents
findThoughts(matDoc, texts = fb_data$translation[fb_data$id %in% meta$id], n = 2)


##### Topic Names
# Topic 1: Appearances on Media
# Topic 2: European Integration
# Topic 3: Local Governments
# Topic 4: Elections, Politics
# Topic 5: Belgrade
# Topic 6: Kosovo
# Topic 7: Opinions and General Talk
# Topic 8: Serbian Heritage
# Topic 9: Education, Schools
# Topic 10: Covid-19 (Economy)
# Topic 11: Government Affairs
# Topic 12: Environmental, Human, and Women's Right
# Topic 13: International Cooperation
# Topic 14: Serbian Presidency
# Topic 15: Police, Military


# Estimated Effect of Cyrillic
meta$year  <- as.character(meta$year)
out <- estimateEffect(formula = 1:15~ inCyrillic + leg_election + prez_election + year,  
                      metadata = meta, stmobj = matDoc, uncertainty = 'Global', prior = 1e-5)


# Create Figure
par(mar = c(5, 15, 1, 1))
plot(out, covariate = 'inCyrillic', 
     cov.value1 = 1, cov.value2 = 0, 
     method = 'difference', 
     labeltype = 'custom', 
     ci.level = 0.95,
     axes = FALSE,
     xlim = c(-0.09, 0.09),
     custom.labels = '',
     xlab = 'Difference in Topic Proportion',
     cex.lab = 1.5
)
mylabels <- c ("Media Appearances"
               , "European Integration"
               , "Local Governments"
               , "Elections/Politics"
               , "Belgrade"
               , "Kosovo"
               , "Opinions and General Talk"
               , "Serbian Heritage"
               , "Education, Schools"
               , "Economy/Covid-19"
               , "Government Affairs"
               , "Environmental, Human,\nand Women's Rights"
               , "International Cooperation"
               , "Serbian Presidency and SNS"
               , "Police/Military")
axis(2, at = 1:15, labels = rev(mylabels), las = 2, cex.axis = 1.25)
axis(1, at = c(-0.09, -0.045, 0, 0.045, 0.09), cex.axis = 1.15)

###################################################
############ SI B.5 - Complete Results ############
###################################################

# Create tables, we will do 3 tables, each will have 5 topics.
toTable <- matrix(NA, ncol = 5, nrow = 10) # Topics 1-5
toTable[seq(1, 8, 2), 1] <- round(unname(summary(out)[['tables']][[1]][1:4, 'Estimate']), 3) # Betas 1
toTable[seq(1, 8, 2), 2] <- round(unname(summary(out)[['tables']][[2]][1:4, 'Estimate']), 3) # Betas 2
toTable[seq(1, 8, 2), 3] <- round(unname(summary(out)[['tables']][[3]][1:4, 'Estimate']), 3) # Betas 3
toTable[seq(1, 8, 2), 4] <- round(unname(summary(out)[['tables']][[4]][1:4, 'Estimate']), 3) # Betas 4
toTable[seq(1, 8, 2), 5] <- round(unname(summary(out)[['tables']][[5]][1:4, 'Estimate']), 3) # Betas 5
toTable[seq(2, 9, 2), 1] <- paste0('(', round(unname(summary(out)[['tables']][[1]][1:4, 'Std. Error']), 3), ')') # SE 1
toTable[seq(2, 9, 2), 2] <- paste0('(', round(unname(summary(out)[['tables']][[2]][1:4, 'Std. Error']), 3), ')') # SE 2
toTable[seq(2, 9, 2), 3] <- paste0('(', round(unname(summary(out)[['tables']][[3]][1:4, 'Std. Error']), 3), ')') # SE 4
toTable[seq(2, 9, 2), 4] <- paste0('(', round(unname(summary(out)[['tables']][[4]][1:4, 'Std. Error']), 3), ')') # SE 4
toTable[seq(2, 9, 2), 5] <- paste0('(', round(unname(summary(out)[['tables']][[5]][1:4, 'Std. Error']), 3), ')') # SE 5
toTable[9, ] <- 'Yes'
toTable[10, ] <- nrow(meta)
colnames(toTable) <- paste("Topic", 1:5)
rownames(toTable) <- c('Intercept', '', 'Cyrillic', '',
                       'Legislative Election', '', 'Presidential Election', '',
                       'FE by Year', 'N')
stargazer::stargazer(toTable, style = 'apsr', type = 'text')

toTable <- matrix(NA, ncol = 5, nrow = 10) # Topics 6-10
toTable[seq(1, 8, 2), 1] <- round(unname(summary(out)[['tables']][[6]][1:4, 'Estimate']), 3) # Betas 6
toTable[seq(1, 8, 2), 2] <- round(unname(summary(out)[['tables']][[7]][1:4, 'Estimate']), 3) # Betas 7
toTable[seq(1, 8, 2), 3] <- round(unname(summary(out)[['tables']][[8]][1:4, 'Estimate']), 3) # Betas 8
toTable[seq(1, 8, 2), 4] <- round(unname(summary(out)[['tables']][[9]][1:4, 'Estimate']), 3) # Betas 9
toTable[seq(1, 8, 2), 5] <- round(unname(summary(out)[['tables']][[10]][1:4, 'Estimate']), 3) # Betas 10
toTable[seq(2, 9, 2), 1] <- paste0('(', round(unname(summary(out)[['tables']][[6]][1:4, 'Std. Error']), 3), ')') # SE 6
toTable[seq(2, 9, 2), 2] <- paste0('(', round(unname(summary(out)[['tables']][[7]][1:4, 'Std. Error']), 3), ')') # SE 7
toTable[seq(2, 9, 2), 3] <- paste0('(', round(unname(summary(out)[['tables']][[8]][1:4, 'Std. Error']), 3), ')') # SE 8
toTable[seq(2, 9, 2), 4] <- paste0('(', round(unname(summary(out)[['tables']][[9]][1:4, 'Std. Error']), 3), ')') # SE 9
toTable[seq(2, 9, 2), 5] <- paste0('(', round(unname(summary(out)[['tables']][[10]][1:4, 'Std. Error']), 3), ')') # SE 10
toTable[9, ] <- 'Yes'
toTable[10, ] <- nrow(meta)
colnames(toTable) <- paste("Topic", 6:10)
rownames(toTable) <- c('Intercept', '', 'Cyrillic', '',
                       'Legislative Election', '', 'Presidential Election', '',
                       'FE by Year', 'N')
stargazer::stargazer(toTable, style = 'apsr', type = 'text')

toTable <- matrix(NA, ncol = 5, nrow = 10) # Topics 11-15
toTable[seq(1, 8, 2), 1] <- round(unname(summary(out)[['tables']][[11]][1:4, 'Estimate']), 3) # Betas 11
toTable[seq(1, 8, 2), 2] <- round(unname(summary(out)[['tables']][[12]][1:4, 'Estimate']), 3) # Betas 12
toTable[seq(1, 8, 2), 3] <- round(unname(summary(out)[['tables']][[13]][1:4, 'Estimate']), 3) # Betas 13
toTable[seq(1, 8, 2), 4] <- round(unname(summary(out)[['tables']][[14]][1:4, 'Estimate']), 3) # Betas 14
toTable[seq(1, 8, 2), 5] <- round(unname(summary(out)[['tables']][[15]][1:4, 'Estimate']), 3) # Betas 15
toTable[seq(2, 9, 2), 1] <- paste0('(', round(unname(summary(out)[['tables']][[11]][1:4, 'Std. Error']), 3), ')') # SE 11
toTable[seq(2, 9, 2), 2] <- paste0('(', round(unname(summary(out)[['tables']][[12]][1:4, 'Std. Error']), 3), ')') # SE 12
toTable[seq(2, 9, 2), 3] <- paste0('(', round(unname(summary(out)[['tables']][[13]][1:4, 'Std. Error']), 3), ')') # SE 13
toTable[seq(2, 9, 2), 4] <- paste0('(', round(unname(summary(out)[['tables']][[14]][1:4, 'Std. Error']), 3), ')') # SE 14
toTable[seq(2, 9, 2), 5] <- paste0('(', round(unname(summary(out)[['tables']][[15]][1:4, 'Std. Error']), 3), ')') # SE 15
toTable[9, ] <- 'Yes'
toTable[10, ] <- nrow(meta)
colnames(toTable) <- paste("Topic", 11:15)
rownames(toTable) <- c('Intercept', '', 'Cyrillic', '',
                       'Legislative Election', '', 'Presidential Election', '',
                       'FE by Year',  'N')
stargazer::stargazer(toTable, style = 'apsr', type = 'text')
